Author:Zongcheng Li
Reviewer:Ying Ge、Junyi Shen
单细胞CNV的计算和画图。Calculation and mapping of single-cell CNVs
Figure 1. Characterizing Intra-tumoral Expression Heterogeneity in HNSCC by Single-Cell RNA-Seq. (B) Heatmap shows large-scale CNVs for individual cells (rows) from a representative tumor (MEEI5), inferred based on the average expression of 100 genes surrounding each chromosomal position (columns). Red: amplifications; blue: deletions.
出自Fromhttps://www.cell.com/cell/fulltext/S0092-8674(17)31270-9
其他文章里类似的图Similar pictures in other articles:
Fig. 1. Dissection of melanoma with single-cell RNA-seq. (B) Chromosomal landscape of inferred large-scale CNVs allows us to distinguish malignant from nonmalignant cells. The Mel80 tumor is shown with individual cells (y axis) and chromosomal regions (x axis). Amplifications (red) or deletions (blue) were inferred by averaging expression over 100-gene stretches on the respective chromosomes.
用单细胞RNA-seq数据计算CNV,对比展示多组之间的差异。CNV was calculated using single-cell RNA-seq data to compare and show differences between multiple groups
使用国内镜像安装包Use the domestic mirror installation package
options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
加载包Load the package
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Warning: package 'AnnotationDbi' was built under R version 3.6.1
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.6.1
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.6.1
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
##
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caTools)
##
## Attaching package: 'caTools'
## The following object is masked from 'package:IRanges':
##
## runmean
## The following object is masked from 'package:S4Vectors':
##
## runmean
library(pheatmap)
Sys.setenv(LANGUAGE = "en") #显示英文报错信息Displays the error message in English
options(stringsAsFactors = FALSE) #禁止chr转成factor Prohibiting CHR from converting to factor
自定义函数Custom functions
minmax <- function(x, min, max ){
x[x > max] <- max
x[x < min] <- min
return(x)
}
GSE103322_HNSCC_all_data.txt.gz,这里以原文的数据为例,其他单细胞RNA-seq数据与此类似,包含表达矩阵和样品分组等信息Here is an example of the original data, and other single-cell RNA-seq data are similar, including information such as expression matrix and sample grouping meta data。
Download GSE103322_HNSCC_all_data.txt.gz from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103322,已上传到微云,下载链接It has been uploaded to Weiyun, download linkhttps://share.weiyun.com/5gsuteR
# read cell annotation information
annodata <- as.data.frame(t(
read.table(file = gzfile("GSE103322_HNSCC_all_data.txt.gz"),
sep = "\t", header = T, row.names = 1, nrows = 5)
))
annodata[1:3,1:3]
## processed by Maxima enzyme Lymph node
## HN28_P15_D06_S330_comb 1 1
## HN28_P6_G05_S173_comb 1 0
## HN26_P14_D11_S239_comb 1 1
## classified as cancer cell
## HN28_P15_D06_S330_comb 0
## HN28_P6_G05_S173_comb 0
## HN26_P14_D11_S239_comb 1
# authors did not provide tumor ID for each single cell? horriable...
tmp <- reshape2::colsplit(gsub(pattern = "HNSCC_17", replacement = "HNSCC17", x = rownames(annodata)),
"_", names = letters[1:10]) #
annodata$tumor <- gsub(pattern = "HN|HNSCC", replacement = "MEEI", tmp$a, ignore.case = T)
head(tmp)
## a b c d e f g h i j
## 1 HN28 P15 D06 S330 comb NA NA NA
## 2 HN28 P6 G05 S173 comb NA NA NA
## 3 HN26 P14 D11 S239 comb NA NA NA
## 4 HN26 P14 H05 S281 comb NA NA NA
## 5 HN26 P25 H09 S189 comb NA NA NA
## 6 HN26 P14 H06 S282 comb NA NA NA
# read normalized expression values
exprdata <- read.table(file = gzfile("GSE103322_HNSCC_all_data.txt.gz"),
sep = "\t", header = T, row.names = 1, skip = 5)
colnames(exprdata) <- rownames(annodata)
exprdata[1:3, 1:3]
## HN28_P15_D06_S330_comb HN28_P6_G05_S173_comb HN26_P14_D11_S239_comb
## C9orf152 0.0000 0.0000 0.42761
## RPS11 6.0037 7.3006 7.28850
## ELMO2 0.0000 0.0000 0.00000
# you may save variables into a Rdata file for quicker data reloading.
save(exprdata, annodata, file = "data.Rdata")
Load and preprocess data
We used the remaining cells (k = 5,902) to identify genes that are expressed at high or intermediate levels by calculating the aggregate
expression of each gene i across the k cells, as Ea(i) = log2(average(TPM(i)1.k)+1), and excluded genes with Ea < 4. For the remaining
cells and genes, we defined relative expression by centering the expression levels, Eri,j = Ei,j-average[Ei,1.k]. The relative
expression levels, across the remaining subset of cells and genes, were used for downstream analysis.
varList <- load(file = "data.Rdata")
varList
## [1] "exprdata" "annodata"
# all cells are high quality with more than 2000 genes in individual cell.
annodata$gene_number <- colSums(exprdata > 0)
# filter genes with high or intermediate expression levels
# Note: This step of filtering out lowly expressed genes is and necessary, otherwise the CNV estimation result may de distorted.
tpmdata <- 10*(2^exprdata-1)
gene_average <- apply(tpmdata, 1, function(x){ log2(mean(x)+1)})
gene_mask <- gene_average > 4
sum(gene_mask)/length(gene_mask)
## [1] 0.3113231
exprdata <- exprdata[gene_mask,]
# sort genes by genomic location
gene_loc <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(exprdata),
columns = c( "CHRLOC"),
keytype = "SYMBOL")
## Warning in .deprecatedColsMessage(): Accessing gene location information via 'CHR','CHRLOC','CHRLOCEND' is
## deprecated. Please use a range based accessor like genes(), or select()
## with columns values like TXCHROM and TXSTART on a TxDb or OrganismDb
## object instead.
## 'select()' returned 1:many mapping between keys and columns
chr_used <- c(as.character(1:22),"X")
gene_loc %<>%
dplyr::filter(CHRLOCCHR %in% chr_used) %<>% # filter out scaffold
dplyr::mutate(chr = factor(CHRLOCCHR, levels = chr_used)) %<>% # set levels for chr
dplyr::arrange(chr, abs(CHRLOC)) # sort genes by genomic location
gene_loc_uniq <- gene_loc[!duplicated(gene_loc$SYMBOL),] # gene deduplication
# set row-order of exprdata
exprdata <- exprdata[gene_loc_uniq$SYMBOL, ]
# get relative expression by centering (note: no scaling)
reladata <- sweep(exprdata, 1, rowMeans(exprdata))
# bound data into [-3, 3]
reladata <- minmax(reladata, -3, 3)
window_length <- 100
# initial CNVs
initial_cnv <- NA + reladata # genes with location information were used for downstream analyses
for (chr in chr_used) {
chr_genes = gene_loc_uniq$SYMBOL[gene_loc_uniq$chr == chr]
chr_data = reladata[chr_genes, , drop = FALSE]
if (nrow(chr_data) > 1) {
chr_data = apply(chr_data, 2, caTools::runmean, k = window_length)
initial_cnv[chr_genes, ] <- chr_data
}else{
print(chr)
}
}
which(is.na(initial_cnv)) # check inistal_cnv
## integer(0)
# re-centering data across chromosome after smoothing, see (Patel et al., 2014)
initial_cnv <- sweep(initial_cnv, 2, apply(initial_cnv, 2, median), FUN = "-")
# initial CNV score of each single-cell
initial_cnv_score <- colMeans(initial_cnv^2)
# initial CNV correlation score
cell_in_which_tumor <- annodata$tumor
initial_cnv_score_tumor_profile <- sapply(unique(cell_in_which_tumor), function(x){
rowMeans(initial_cnv[, cell_in_which_tumor == x])
})
initial_cnv_corr <- sapply(colnames(initial_cnv), function(x) {
cor(x = as.numeric(initial_cnv[, x, drop = T]), y = initial_cnv_score_tumor_profile[,annodata[x,"tumor"]])
})
initial_cnv_score_threshold <- 0.05
initial_cnv_corr_threshold <- 0.5
# putative maglignant cells
initital_putative_maglignant <- names(which(
initial_cnv_score > initial_cnv_score_threshold &
initial_cnv_corr > initial_cnv_corr_threshold
))
# putative non-maglignant cells
initital_putative_non_maglignant <- names(which(
initial_cnv_score < initial_cnv_score_threshold &
initial_cnv_corr < initial_cnv_corr_threshold
))
table(annodata[initital_putative_maglignant, "classified as cancer cell"])
##
## 0 1
## 1 1323
table(annodata[initital_putative_non_maglignant, "classified as non-cancer cells"])
##
## 0 1
## 746 3008
annodata_base <- annodata[initital_putative_non_maglignant, c("non-cancer cell type"), drop = F]
table(annodata_base$`non-cancer cell type`)
##
## -Fibroblast 0 B cell Dendritic Endothelial Fibroblast
## 11 746 125 34 244 1328
## Macrophage Mast myocyte T cell
## 97 117 19 1033
types_used <- c("B cell", "Dendritic", "Endothelial", "Fibroblast", "Macrophage", "Mast", "myocyte", "T cell")
annodata_base <- annodata_base[annodata_base$`non-cancer cell type` %in% types_used,,drop = F]
# baseline
baseline_cnv <- as.matrix(t(apply(initial_cnv[,rownames(annodata_base)], 1, function(x) {
tapply(x, annodata_base$`non-cancer cell type`, mean)
})))
baseline_max <- matrix(rowMax(baseline_cnv),
nrow = nrow(initial_cnv),
ncol = ncol(initial_cnv),
dimnames = dimnames(initial_cnv))
baseline_min <- matrix(rowMin(baseline_cnv),
nrow = nrow(initial_cnv),
ncol = ncol(initial_cnv),
dimnames = dimnames(initial_cnv))
# final CNVs
final_cnv <- 0* initial_cnv
final_cnv[initial_cnv > baseline_max + 0.2] <- (initial_cnv - baseline_max)[initial_cnv > baseline_max + 0.2]
final_cnv[initial_cnv < baseline_min - 0.2] <- (initial_cnv - baseline_min)[initial_cnv < baseline_min - 0.2]
# re-centering data across chromosome after smoothing
# final_cnv <- sweep(final_cnv, 2, apply(final_cnv, 2, median), FUN = "-")
用pheatmap画图Draw with pheatmap
annodata$type <- factor(ifelse(annodata$`classified as non-cancer cells` == 1,
"Non-malignant",
ifelse(annodata$`classified as cancer cell` == 1 & annodata$`Lymph node` == 0,
"Malignant (primary)",
"Malignant (LN)"
)),
levels = c("Non-malignant", "Malignant (primary)","Malignant (LN)"))
phAnno <- annodata[annodata$tumor == "MEEI5",]
phAnno <- phAnno[order(phAnno$type),]
phData <- t(final_cnv[,rownames(phAnno)])
phData <- minmax(phData, min = -1, max = 1)
pheatmap(phData,
color = colorRampPalette(c("darkblue", "blue", "grey90", "red", "red4"),
interpolate = "linear")(11),
# breaks = c(seq(-1, -0.1, length.out = 50),
# seq(0.1, 1, length.out = 50)),
annotation_row = phAnno[, "type", drop = F],
gaps_col = cumsum(table(gene_loc_uniq$chr)),
cluster_rows = F, cluster_cols = F,
show_colnames = F, show_rownames = F,
filename = "scCNV.pdf")